### FIGURE 1 ###
library(ggplot2)
library(RColorBrewer)
library(dplyr)
library(tidyverse)
library(ggrepel)
library(cowplot)
library(ggsignif)

### Loading data - Panel B
# Headerless tab-separated table; the two columns are named 'type' and 'n'.
path1 <- "cluster_defsys_count.tsv"
cluster <- read.delim(path1, header = FALSE, col.names = c("type", "n"))

### Loading data - Panel C
# Headerless tab-separated table; the two columns are named 'type' and 'count'.
path2 <- "defsys_strains_count.tsv"
strains <- read.delim(path2, header = FALSE, col.names = c("type", "count"))

### Loading data - Panel D
# Tab-separated table whose first line supplies the column names
# (the code below uses its 'type' and 'subtype' columns).
path3 <- "subtypes_count.tsv"
subtype <- read.delim(path3, header = TRUE)

# Filter Cluster - Panel B
# Lump every defense-system type that accounts for less than 1 % of the
# counts into a single 'Other' category.
#
# The input contains a stray record whose type is literally 'type' (the script
# filters it out further down). It is removed here, before any arithmetic, so
# that it neither inflates the percentage denominator nor gets relabelled as
# 'Other' (which happens whenever its share is below the 1 % cut-off, making
# the later filter a no-op). `%in%` is used so rows with a missing type are
# kept exactly as before.
cluster_valid <- cluster[!(cluster$type %in% "type"), ]
cluster_total <- sum(cluster_valid$n)

# Calculate percentage (rounded to two decimals; the cut-off uses this value)
cluster_per <- cluster_valid %>%
  mutate(per = round(n / cluster_total * 100, 2))
cluster_filter <- cluster_per[cluster_per$per < 1, ] # Rows below the cut-off
cluster_filter_types <- cluster_filter$type # Types removal list

# Replace the low-frequency types with 'Other' in one vectorised assignment
cluster_per$type[cluster_per$type %in% cluster_filter_types] <- "Other"

# Join 'Other': add up the counts per label, then recompute the percentage
# from the summed counts so that 'Other' is not a sum of rounded values.
cluster_other <- cluster_per %>%
  group_by(type) %>%
  summarise(n = sum(n), .groups = "drop") %>%
  mutate(per = round(n / cluster_total * 100, 2))

# Filter Strains - Panel C
# Lump every defense-system type found in less than 1 % of the strains into a
# single 'Other' category.
#
# The input contains a stray record whose type is literally 'type' (the script
# filters it out further down). It is removed here first; otherwise it is
# relabelled as 'Other' whenever its share is below the 1 % cut-off and the
# later filter can no longer find it. `%in%` is used so rows with a missing
# type are kept exactly as before.
n_strains <- 8929 # Total number of strains used as the percentage denominator
strains_valid <- strains[!(strains$type %in% "type"), ]

# Calculate percentage (rounded to two decimals; the cut-off uses this value)
strains_per <- strains_valid %>%
  mutate(per = round(count / n_strains * 100, 2))
strains_filter <- strains_per[strains_per$per < 1, ] # Rows below the cut-off
strains_filter_types <- strains_filter$type # Types removal list

# Replace the low-frequency types with 'Other' in one vectorised assignment
strains_per$type[strains_per$type %in% strains_filter_types] <- "Other"

# Join 'Other': add up the counts per label, then recompute the percentage
# from the summed counts so that 'Other' is not a sum of rounded values.
strains_other <- strains_per %>%
  group_by(type) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  mutate(per = round(count / n_strains * 100, 2))


# Avoid 'type' in our data: drop the record labelled exactly 'type' from the
# cluster table, and every record whose label contains 'type' from the strains
# table.
cluster_def <- cluster_other %>% filter(type != "type")
strains_def <- strains_other %>% filter(!grepl("type", type))

# Turn the labels into factors whose levels follow ascending count; the bar
# charts use this level order on their axis.
cluster_def$type <- factor(
  cluster_def$type,
  levels = with(cluster_def, type[order(n, decreasing = FALSE)])
)
strains_def$type <- factor(
  strains_def$type,
  levels = with(strains_def, type[order(count, decreasing = FALSE)])
)


# Colors
# Lookup table pairing each defense-system type (column 'Types') with its fill
# colour (column 'Colors'). Written as name = colour pairs so every type sits
# next to its colour; several colours are deliberately repeated as in the
# original palette.
colors <- local({
  type_colors <- c(
    "R-M" = "#BC808D",
    "SspBCDE" = "#FB8072",
    "Gao_Qat" = "#FDB462",
    "CBASS" = "#8DD3C7",
    "Cas" = "#BEBADA",
    "Septu" = "#80B1D3",
    "Lamassu-Fam" = "#377EB8",
    "PD-T4-5" = "#A6D854",
    "PD-T7-5" = "#4DAF4A",
    "Gabija" = "#FFFFB3",
    "Retron" = "#FCCDE5",
    "RosmerTA" = "#E78AC3",
    "Rst_PARIS" = "#E5C494",
    "BREX" = "#fbb4a3",
    "AVAST" = "#b3cde3",
    "NLR" = "#ccebc5",
    "Mokosh" = "#decbe4",
    "Zorya" = "#fed9a6",
    "dGTPase" = "#ffffcc",
    "DRT" = "#e5d8bd",
    "RloC" = "#fddaec",
    "SEFIR" = "#ffed6f",
    "PD-Lambda-2" = "#7fc97f",
    "Shedu" = "darkslategray1",
    "DarTG" = "#beaed4",
    "dCTPdeaminase" = "lightgoldenrod",
    "CapRel" = "#fdc086",
    "AbiH" = "#ffff99",
    "SanaTA" = "#386cb0",
    "PD-Lambda-5" = "#66a61e",
    "Druantia" = "#1b9e77",
    "Other" = "#b3b3b3",
    "PrrC" = "lightgoldenrod",
    "Menshen" = "#fdc086",
    "Thoeris" = "#386cb0",
    "Hachiman" = "#66a61e",
    "Dnd" = "#1b9e77"
  )
  # unname() keeps the default 1..n row names instead of the type names
  data.frame(Types = names(type_colors), Colors = unname(type_colors))
})



# Draw plots - Panel B & C
# Both panels are horizontal bar charts (coord_flip) with one bar per type,
# filled from the 'colors' lookup table and annotated with the percentage just
# past the end of each bar. The value axis limits are fixed per panel.

# Panel B: values from the 'n' column of cluster_def
B <- ggplot(cluster_def, aes(x = type, y = n, fill = type)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(
    aes(label = paste0(per, "%")),
    size = 5.5, vjust = 0.5, hjust = -0.05
  ) +
  scale_fill_manual(values = setNames(colors$Colors, colors$Types)) +
  ylim(0, 700) +
  coord_flip() +
  labs(x = "Defense systems types", y = "No. genes in pangenome") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 15),
    axis.text.x = element_text(size = 14),
    axis.title = element_text(size = 12)
  )

# Panel C: values from the 'count' column of strains_def; the type-axis title
# is blanked
C <- ggplot(strains_def, aes(x = type, y = count, fill = type)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  geom_text(
    aes(label = paste0(per, "%")),
    size = 5.5, vjust = 0.5, hjust = -0.05
  ) +
  scale_fill_manual(values = setNames(colors$Colors, colors$Types)) +
  ylim(0, 6500) +
  coord_flip() +
  labs(y = "No. of genomes") +
  theme_minimal() +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 15),
    axis.text.x = element_text(size = 14),
    axis.title = element_text(size = 12)
  )


# Processing Subtypes - Panel D
# Flag each record: 'Same' when the subtype label equals the type label,
# 'Dif' otherwise.
subtype$result <- ifelse(subtype$type == subtype$subtype, "Same", "Dif")
# Keep only the 'Dif' records and drop the helper column 'result'
types_dif <- subtype[
  which(subtype$result == "Dif"),
  names(subtype) != "result",
  drop = FALSE
]
# Count records per (type, subtype) pair. tally() peels off only the last
# grouping level, so the result stays grouped by 'type'; the percentage
# computed further down relies on that.
count_types <- types_dif %>%
  group_by(type, subtype) %>%
  tally()

# Filter Panel D
# Keep only the types that also appear in the cluster table
count_types <- count_types %>% filter(type %in% cluster$type)
# Identify the types that occur in more than one row (i.e. several subtypes)
duplicated_types <- with(
  count_types,
  type[duplicated(type) | duplicated(type, fromLast = TRUE)]
)
# Retain only the types with more than one subtype
count_types2 <- count_types %>% filter(type %in% duplicated_types)
# List of types with greater diversity of subtypes
types_heterog <- c("CBASS", "Rst_PARIS", "Retron", "R-M")
# Subtypes removed because of their low frequency of appearance
subtype_out <- c("Retron_VII_2", "Retron_V", "Retron_II", "Retron_XII")
count_types_filter <- count_types2 %>%
  filter(type %in% types_heterog) %>%
  filter(!(subtype %in% subtype_out))
# Calculate percentage (within each type, because the table is still grouped
# by 'type'), then sort by decreasing percentage
count_types_def <- count_types_filter %>%
  mutate(per = n / sum(n) * 100) %>%
  arrange(desc(per))


# Colors
# Lookup table pairing each subtype (column 'Subtypes') with its fill colour
# (column 'Colors'); the colours are grouped by parent type.
colors_subtypes <- local({
  subtype_colors <- c(
    "CBASS_I" = "#7FFFD4",
    "CBASS_II" = "#66CDAA",
    "CBASS_III" = "#458B74",
    "PARIS_I" = "#FFD39B",
    "PARIS_II" = "#CDAA7D",
    "PARIS_II_merge" = "#8B7355",
    "Retron_I_A" = "#CD8C95",
    "Retron_I_C" = "#EEA2AD",
    "Retron_III" = "#FFC0CB",
    "Retron_IV" = "#FFAEB9",
    "Retron_VI" = "#FF82AB",
    "R-M_Type_I" = "#8B475D",
    "R-M_Type_II" = "#8B4789",
    "R-M_Type_IIG" = "#CD69C9",
    "R-M_Type_III" = "#CD00CD",
    "R-M_Type_IV" = "#8B008B"
  )
  # unname() keeps the default 1..n row names instead of the subtype names
  data.frame(
    Subtypes = names(subtype_colors),
    Colors = unname(subtype_colors)
  )
})

# Panel D: stacked bars, one bar per type, split by subtype. Bar height is the
# 'per' column and each segment is labelled with its count 'n'.
D <- ggplot(count_types_def, aes(x = type, y = per, fill = subtype)) +
  geom_bar(stat = "identity") +
  # position_stack(vjust = 0.5) centres each label inside its stacked segment
  geom_text(
    aes(label = n),
    size = 5,
    position = position_stack(vjust = .5)
  ) +
  scale_fill_manual(
    values = setNames(colors_subtypes$Colors, colors_subtypes$Subtypes)
  ) +
  # Force legend into two columns
  guides(fill = guide_legend(ncol = 2)) +
  labs(
    x = "Defense systems types",
    y = "Percentage strains (%)",
    fill = "Defense Systems Subtypes"
  ) +
  theme_minimal() +
  theme(
    text = element_text(size = 13),
    axis.text.x = element_text(size = 12, hjust = 0.5, vjust = 4),
    legend.key.size = unit(6, "mm"),
    legend.title = element_blank(),
    legend.text = element_text(size = 12)
  )


# Join panels
# NOTE(review): ggplotify is attached here but none of its functions are
# called by the code visible in this script; kept in case something else
# relies on it.
library(ggplotify)

# Top row: panels B and C side by side with equal widths
BC <- plot_grid(
  B, C,
  labels = c("B", "C"),
  ncol = 2,
  rel_widths = c(0.4, 0.4)
)

# Thin blank spacer placed between the top row and panel D; the "D" label is
# attached to this spacer row rather than to the plot itself.
empty <- ggplot() + theme_minimal()

BCD <- plot_grid(
  BC, empty, D,
  labels = c("", "D", ""),
  nrow = 3, ncol = 1,
  rel_heights = c(3, 0.2, 1.7)
)

print(BCD)

# Pass the figure explicitly: without 'plot', ggsave() writes whatever plot
# was displayed last, which silently depends on the print() call above.
ggsave(
  "fig1_BCD.svg",
  plot = BCD,
  width = 13, height = 10, dpi = 300,
  device = "svg"
)